#This script generates the gsubbed corpora for the leader and executive analyses
#From these, it then generates the target tokens files for the main analyses
#The target tokens files are used for the ALC analyses and consist of the contexts around the target (leader or executive) words
#Note that ONLY the target tokens files are used for the ALC analyses
#The raw text and gsubbed corpora will NOT be included in the public replication package
#The script also generates a table of articles per outlet for Appendix Table B1
library(dplyr)
library(fst)
library(conText)
library(data.table)
library(dplyr)
library(readr)
library(quanteda)
library(xtable)
source("utils.R")
set.seed(123L)

# Corpus identifiers, one per country; each one appears in the names of the
# input and output files handled below
countries <- c(
  "djazairess",
  "maghress",
  "masress",
  "sauress",
  "turess"
)

# Replace leader names with TARGETWORD in one country's corpus and save it.
#
# Args:
#   country_name: Corpus identifier (an element of `countries`). It is used to
#     build the input and output file names and to select the matching rows
#     of data/other/leaders.csv.
#
# Reads data/analysis_corpora/<country_name>_nws_corpus.rds, samples up to
# 500,000 articles, and for every leader whose term (start to end, inclusive)
# covers an article's date substitutes the leader's name in `content` with
# "TARGETWORD" and records that name in `leader_name`. Articles that fall in
# no leader's term keep their text, and `leader_name` stays NA for them when
# the column is created here. The result is written to
# data/analysis_corpora_gsubbed/<country_name>_nws_corpus_leadertarg.rds.
#
# Returns: NULL (the value of saveRDS()); called for its side effect.
process_leaders <- function(country_name) {
  # Read the corresponding analysis corpus
  nws_corpus <-
    readRDS(paste0("data/analysis_corpora/", country_name, "_nws_corpus.rds"))

  # Cap the sample size at the corpus size: sample_n() errors when asked for
  # more rows than exist. Corpora with at least 5e5 rows are sampled exactly
  # as before.
  n_sample <- min(5e5, nrow(nws_corpus))
  nws_corpus <- nws_corpus %>%
    sample_n(n_sample)

  # Filter the leaders for the specific country
  leaders_df <-
    read.csv("data/other/leaders.csv", stringsAsFactors = FALSE) %>%
    filter(country == country_name)

  # Converting the start and end dates to Date format
  leaders_df$start <- as.Date(leaders_df$start, format = "%d/%m/%Y")
  leaders_df$end <- as.Date(leaders_df$end, format = "%d/%m/%Y")

  # Create the column up front. Assigning into a single element of a column
  # that does not exist yet either recycles the first value to every row or
  # fails, depending on the row index.
  if (!"leader_name" %in% names(nws_corpus)) {
    nws_corpus$leader_name <- rep(NA_character_, nrow(nws_corpus))
  }

  # One vectorised pass per leader instead of one gsub() call per article.
  # Leaders are still applied in file order, so an article covered by several
  # terms receives the same sequence of substitutions as in a row-wise loop
  # and ends up with the last matching leader's name.
  # NOTE(review): gsub() treats each name as a regular expression - confirm
  # that this is intended for the entries in leaders.csv.
  for (j in seq_len(nrow(leaders_df))) {
    # which() drops NA comparisons, so articles or terms with a missing date
    # are skipped instead of aborting the run
    in_term <- which(
      nws_corpus$date >= leaders_df$start[j] &
        nws_corpus$date <= leaders_df$end[j]
    )
    if (length(in_term) == 0L) {
      next
    }
    nws_corpus$content[in_term] <-
      gsub(leaders_df$name[j], "TARGETWORD", nws_corpus$content[in_term])
    nws_corpus$leader_name[in_term] <- leaders_df$name[j]
  }

  # Save the modified corpus, creating the output folder when it is missing
  out_file <- paste0(
    "data/analysis_corpora_gsubbed/",
    country_name,
    "_nws_corpus_leadertarg.rds"
  )
  dir.create(dirname(out_file), recursive = TRUE, showWarnings = FALSE)
  saveRDS(nws_corpus, out_file)
}

# Build and save the leader-substituted corpus for every country in turn
lapply(X = countries, FUN = process_leaders)


# Replace executive names with TARGETWORD in one country's corpus and save it.
#
# Args:
#   country_name: Corpus identifier (an element of `countries`). It is used to
#     build the input and output file names and to select the matching rows
#     of data/executive.csv.
#
# Reads data/analysis_corpora/<country_name>_nws_corpus.rds, samples up to
# 500,000 articles, and substitutes every executive name listed for the
# country with "TARGETWORD" in `content`, regardless of the article's date.
# The result is written to
# data/analysis_corpora_gsubbed/<country_name>_nws_corpus_exectarg.rds.
#
# Returns: NULL (the value of saveRDS()); called for its side effect.
process_exec <- function(country_name) {
  # Read the corresponding analysis corpus
  nws_corpus <-
    readRDS(paste0("data/analysis_corpora/", country_name, "_nws_corpus.rds"))

  # Cap the sample size at the corpus size: sample_n() errors when asked for
  # more rows than exist. Corpora with at least 5e5 rows are sampled exactly
  # as before.
  n_sample <- min(5e5, nrow(nws_corpus))
  nws_corpus <- nws_corpus %>%
    sample_n(n_sample)

  # Filter the executives for the specific country
  # NOTE(review): the leaders file is read from data/other/ while this one is
  # read from data/ - confirm that the path is correct.
  exec_df <-
    read.csv("data/executive.csv", stringsAsFactors = FALSE) %>%
    filter(country == country_name)

  # One vectorised gsub() per executive name over the whole content column.
  # Names are applied in file order, exactly as a row-wise loop would apply
  # them to each article. seq_len() makes this a no-op when no executives are
  # listed, where 1:nrow() would have run with indices 1 and 0.
  # NOTE(review): gsub() treats each name as a regular expression - confirm
  # that this is intended for the entries in executive.csv.
  for (j in seq_len(nrow(exec_df))) {
    nws_corpus$content <-
      gsub(exec_df$name[j], "TARGETWORD", nws_corpus$content)
  }

  # Save the modified corpus, creating the output folder when it is missing
  out_file <- paste0(
    "data/analysis_corpora_gsubbed/",
    country_name,
    "_nws_corpus_exectarg.rds"
  )
  dir.create(dirname(out_file), recursive = TRUE, showWarnings = FALSE)
  saveRDS(nws_corpus, out_file)
}

# Build and save the executive-substituted corpus for every country in turn
lapply(X = countries, FUN = process_exec)


# Get table of articles per outlet (Appendix Table B1):
# For every country, count the articles per newspaper in the unsampled
# analysis corpus, stack the per-country counts, and print them as a LaTeX
# longtable.

all_country_results <- list()
analysis_corpora_path <- "data/analysis_corpora/"

# Loop through all countries
for (country in countries) {
  cat(paste0("Processing ", country, " ...\n"))

  # Load the news corpus for each country
  nws_corpus <- readRDS(paste0(analysis_corpora_path, country, "_nws_corpus.rds"))

  # Count the articles per newspaper, largest outlet first
  country_result <- nws_corpus %>%
    group_by(newspaper) %>%
    summarize(num_articles = n()) %>%
    arrange(desc(num_articles)) %>%
    # `country` on the right-hand side is the loop variable: the summarised
    # table only has the columns newspaper and num_articles
    mutate(country = country)

  # Store the result in the list, keyed by country
  all_country_results[[country]] <- country_result
}

# Combine all the results together
combined_results <- bind_rows(all_country_results)

# Total number of articles across all countries and outlets
total_articles <- sum(combined_results$num_articles)

# Create Appendix table B1
x.big <- xtable(combined_results, digits = 0)
# Spell out FALSE: `F` is an ordinary variable that can be reassigned
print(x.big, tabular.environment = "longtable", floating = FALSE)


# Build and save the target tokens object for one gsubbed corpus.
#
# Args:
#   corpus_file: Path to an .rds file holding a data frame with a `content`
#     column (one row per article).
#   output_path: Path of the .rds file the resulting tokens are written to.
#   target, window, valuetype, hard_cut, case_insensitive: Forwarded to
#     tokens_context() as pattern, window, valuetype, hard_cut and
#     case_insensitive.
#   remove_punct, remove_symbols, remove_numbers, remove_separators:
#     Forwarded unchanged to tokens().
#
# Called for its side effects: writes the tokens to `output_path` and prints
# a confirmation line.
generate_target_tokens <- function(corpus_file, output_path,
                                   target = "TARGETWORD",
                                   window = 12L,
                                   valuetype = "fixed",
                                   hard_cut = FALSE,
                                   case_insensitive = TRUE,
                                   remove_punct = FALSE,
                                   remove_symbols = FALSE,
                                   remove_numbers = FALSE,
                                   remove_separators = FALSE) {
  # Load the articles and label them ART1, ART2, ... in row order
  articles <- readRDS(corpus_file)
  articles$doc_id <- paste0("ART", seq_len(nrow(articles)))

  # Build the quanteda corpus from the article text
  article_corpus <- corpus(
    articles,
    text_field = "content",
    docid_field = "doc_id"
  )

  # text_field and docid_field are consumed by corpus(); attach both again as
  # document variables
  docvars(article_corpus, "content") <- articles$content
  docvars(article_corpus, "doc_id") <- articles$doc_id

  # Split the articles into tokens with the requested removal options
  article_toks <- tokens(
    article_corpus,
    remove_punct = remove_punct,
    remove_symbols = remove_symbols,
    remove_numbers = remove_numbers,
    remove_separators = remove_separators
  )

  # Keep only the contexts surrounding the target word
  context_toks <- tokens_context(
    x = article_toks,
    pattern = target,
    valuetype = valuetype,
    window = window,
    hard_cut = hard_cut,
    case_insensitive = case_insensitive
  )

  # Write the contexts to disk and report where they went
  saveRDS(context_toks, output_path)
  cat("Saved target tokens to", output_path, "\n")
}

# Folder that receives the target tokens objects; create it on first use
destination_path <- "data/analysis_toks/"
if (!dir.exists(destination_path)) dir.create(destination_path, recursive = TRUE)

# Leader corpora: extract the contexts around TARGETWORD for every country
for (country in countries) {
  corpus_file_leader <- paste0(
    "data/analysis_corpora_gsubbed/", country, "_nws_corpus_leadertarg.rds"
  )
  target_tokens_file_leader <- paste0(
    destination_path, country, "_target_toks_leader.rds"
  )

  generate_target_tokens(
    corpus_file = corpus_file_leader,
    output_path = target_tokens_file_leader,
    target = "TARGETWORD",
    window = 12L
  )
}

# Executive corpora: same extraction, run after all leader corpora are done
for (country in countries) {
  corpus_file_exec <- paste0(
    "data/analysis_corpora_gsubbed/", country, "_nws_corpus_exectarg.rds"
  )
  target_tokens_file_exec <- paste0(
    destination_path, country, "_target_toks_exec.rds"
  )

  generate_target_tokens(
    corpus_file = corpus_file_exec,
    output_path = target_tokens_file_exec,
    target = "TARGETWORD",
    window = 12L
  )
}